\contentsline {chapter}{Executive Summary}{i}{chapter*.1}
\vspace {1em}
\contentsline {chapter}{List of Figures}{vii}{dummy.4}
\contentsline {chapter}{List of Tables}{ix}{dummy.6}
\contentsline {chapter}{\numberline {1}Motivation}{1}{chapter.8}
\contentsline {section}{\numberline {1.1}The Problem}{1}{section.9}
\contentsline {section}{\numberline {1.2}Currently Available Services}{2}{section.14}
\contentsline {subsection}{\numberline {1.2.1}Specialised Local News Web Sites}{3}{subsection.16}
\contentsline {subsection}{\numberline {1.2.2}Manually Edited Portals}{3}{subsection.17}
\contentsline {subsection}{\numberline {1.2.3}`Hyperlocal' Information Services}{3}{subsection.19}
\contentsline {subsection}{\numberline {1.2.4}News Aggregators}{4}{subsection.22}
\contentsline {section}{\numberline {1.3}Proposed Solution}{4}{section.28}
\contentsline {section}{\numberline {1.4}Project Aims}{5}{section.29}
\contentsline {chapter}{\numberline {2}Background}{7}{chapter.30}
\contentsline {section}{\numberline {2.1}News Aggregation}{7}{section.31}
\contentsline {section}{\numberline {2.2}RSS Feeds}{8}{section.34}
\contentsline {section}{\numberline {2.3}Tree Representation of Web Pages}{8}{section.35}
\contentsline {section}{\numberline {2.4}Web Content Extraction}{8}{section.39}
\contentsline {section}{\numberline {2.5}Text Classification}{9}{section.42}
\contentsline {chapter}{\numberline {3}Technical Basis}{11}{chapter.43}
\contentsline {section}{\numberline {3.1}Online Content Monitoring}{11}{section.44}
\contentsline {subsection}{\numberline {3.1.1}Web Crawling}{12}{subsection.45}
\contentsline {subsection}{\numberline {3.1.2}RSS Feed Monitoring}{12}{subsection.46}
\contentsline {section}{\numberline {3.2}Article Content Extraction}{12}{section.48}
\contentsline {subsection}{\numberline {3.2.1}Visual Wrapper Approach}{13}{subsection.49}
\contentsline {subsection}{\numberline {3.2.2}Extraction Using Tree-Edit Distance}{13}{subsection.51}
\contentsline {subsection}{\numberline {3.2.3}Perception-Oriented News Extraction}{14}{subsection.53}
\contentsline {subsection}{\numberline {3.2.4}Tree Analysis Using RSS Extract}{14}{subsection.54}
\contentsline {section}{\numberline {3.3}Document Representation}{15}{section.57}
\contentsline {subsection}{\numberline {3.3.1}Bag of Words}{15}{subsection.59}
\contentsline {subsection}{\numberline {3.3.2}Bag of Concepts}{16}{subsection.63}
\contentsline {section}{\numberline {3.4}Term Weighting in a Feature Space}{16}{section.64}
\contentsline {subsection}{\numberline {3.4.1}Binary}{16}{subsection.65}
\contentsline {subsection}{\numberline {3.4.2}Term Frequency}{16}{subsection.66}
\contentsline {subsection}{\numberline {3.4.3}TF-IDF Weighting}{16}{subsection.67}
\contentsline {section}{\numberline {3.5}Feature Selection}{17}{section.69}
\contentsline {subsection}{\numberline {3.5.1}Stop-Word Removal}{17}{subsection.70}
\contentsline {subsection}{\numberline {3.5.2}Word Stemming}{17}{subsection.71}
\contentsline {subsection}{\numberline {3.5.3}Document Frequency}{17}{subsection.73}
\contentsline {subsection}{\numberline {3.5.4}Information Gain}{18}{subsection.74}
\contentsline {section}{\numberline {3.6}Text Classification}{18}{section.75}
\contentsline {subsection}{\numberline {3.6.1}Support Vector Machines}{18}{subsection.76}
\contentsline {subsection}{\numberline {3.6.2}Other Methods}{22}{subsection.96}
\contentsline {section}{\numberline {3.7}Topic Categorisation}{23}{section.101}
\contentsline {subsection}{\numberline {3.7.1}Link Analysis}{23}{subsection.102}
\contentsline {subsection}{\numberline {3.7.2}Content Analysis}{24}{subsection.104}
\contentsline {section}{\numberline {3.8}Location Categorisation}{24}{section.105}
\contentsline {subsection}{\numberline {3.8.1}Source Analysis}{24}{subsection.106}
\contentsline {subsection}{\numberline {3.8.2}Location Extraction}{24}{subsection.107}
\contentsline {subsection}{\numberline {3.8.3}Content Analysis}{25}{subsection.109}
\contentsline {section}{\numberline {3.9}Finding Neighbouring Locations}{26}{section.113}
\contentsline {subsection}{\numberline {3.9.1}Location Database}{26}{subsection.115}
\contentsline {chapter}{\numberline {4}Project Execution}{29}{chapter.118}
\contentsline {section}{\numberline {4.1}Development Approach}{29}{section.119}
\contentsline {subsection}{\numberline {4.1.1}Language Selection}{29}{subsection.120}
\contentsline {subsection}{\numberline {4.1.2}Content Revision}{29}{subsection.123}
\contentsline {section}{\numberline {4.2}Product Overview}{30}{section.125}
\contentsline {section}{\numberline {4.3}Collection of News Articles}{30}{section.127}
\contentsline {subsection}{\numberline {4.3.1}Collection Process}{31}{subsection.128}
\contentsline {section}{\numberline {4.4}Selection and Classification of Acquired Articles}{32}{section.132}
\contentsline {subsection}{\numberline {4.4.1}Scope}{32}{subsection.133}
\contentsline {subsection}{\numberline {4.4.2}Selection Process}{32}{subsection.136}
\contentsline {subsection}{\numberline {4.4.3}Classification Using Support Vector Machines}{34}{subsection.142}
\contentsline {subsection}{\numberline {4.4.4}Training Process}{34}{subsection.144}
\contentsline {subsection}{\numberline {4.4.5}Classification Process}{35}{subsection.151}
\contentsline {section}{\numberline {4.5}Query Expansion}{35}{section.154}
\contentsline {subsection}{\numberline {4.5.1}Location Storage}{36}{subsection.155}
\contentsline {section}{\numberline {4.6}Rendering}{36}{section.159}
\contentsline {chapter}{\numberline {5}Evaluation}{37}{chapter.160}
\contentsline {section}{\numberline {5.1}Article Content Extraction}{37}{section.161}
\contentsline {subsection}{\numberline {5.1.1}Limitations of News Extraction Technique}{38}{subsection.167}
\contentsline {section}{\numberline {5.2}Text Classification}{39}{section.172}
\contentsline {subsection}{\numberline {5.2.1}Testing Methods}{39}{subsection.173}
\contentsline {subsection}{\numberline {5.2.2}Kernel Selection}{39}{subsection.174}
\contentsline {subsection}{\numberline {5.2.3}Parameter Selection}{40}{subsection.176}
\contentsline {subsection}{\numberline {5.2.4}Feature Selection}{41}{subsection.178}
\contentsline {subsection}{\numberline {5.2.5}Size of Training Set}{42}{subsection.188}
\contentsline {subsection}{\numberline {5.2.6}Topic Categorisation}{44}{subsection.190}
\contentsline {subsection}{\numberline {5.2.7}Location Ranking}{45}{subsection.192}
\contentsline {subsection}{\numberline {5.2.8}Limitations of SVM Approach}{46}{subsection.197}
\contentsline {subsection}{\numberline {5.2.9}Scalability}{46}{subsection.198}
\contentsline {section}{\numberline {5.3}Query Expansion}{47}{section.200}
\contentsline {chapter}{\numberline {6}Conclusion}{49}{chapter.202}
\contentsline {section}{\numberline {6.1}Future Work}{50}{section.203}
\vspace {2em}
\contentsline {chapter}{\numberline {A}Screenshot of AlterNews.com}{53}{appendix.206}
\contentsline {chapter}{\numberline {B}Results Tables}{55}{appendix.208}
\vspace {2em}
\contentsline {chapter}{Bibliography}{57}{dummy.211}
